if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("ggtree")A Gentle Introduction to PCM
Libraries to Install and Load
Most are available on CRAN, but for {ggtree}, it may be necessary to to do the following:
library(tidyverse)
library(ape)
library(phytools)
library(ggtree)
library(broom)
library(nlme)The following packages are also also useful for PCM: {geiger}, {phangorn}, {treedata.table}
Plotting a Phylogeny
It is easy to load in phylogenetic trees saved as text files in Newick or Nexus format…
Newick format uses nested parentheses to represent the nesting of clades. Commas separate sister nodes or branches, and the entire tree is terminated with a semicolon. Branch lengths leading to a given node can be indicated following a colon.
Nexus format is a more comprehensive format that can include sequence alignments or character state data in addition to one or more phylogenetic trees in its “TREES” block. Within that block, trees are represented in Newick format.
Example 1 - Kuderna et al 2024
tree_file <- "Kuderna et al S4 fossil calibrated time tree.newick"
tree <- read.tree(tree_file)
is.rooted.phylo(tree)[1] TRUE
is.binary(tree)[1] TRUE
## quick plot
plot.phylo(tree, type = "cladogram", cex = 0.6)## or type = "cladogram" or "phylogram"
## plot with ggplot
p <- ggtree(tree, layout = "fan") +
geom_tiplab(aes(label=label), size=2)
pExample 2 - Lewis et al 2023
tree_file <- "Lewis et al figure S1 tree.nexus"
tree <- read.nexus(tree_file)
par(mar = c(2, 1, 1, 1)) # set bottom, left, top, and right margins
plot.phylo(tree, type = "phylogram", cex = 0.6, direction = "rightwards")
axisPhylo(backward = TRUE) # compare to lewis et al figure S1Example 3 - Springer et al 2015
tree_file <- "Springer et al S2 timetree with autocorrelated rates and hard-bounded constraints.newick"
tree <- read.tree(tree_file)
is.rooted.phylo(tree)[1] TRUE
is.binary(tree)[1] TRUE
# plot full tree
p <- ggtree(tree, layout = "fan") +
geom_tiplab(aes(label=label), size=2)
p# prune the tree to only species used by Lewis et al 2023
d <- read_csv("Lewis et al table S1.csv", col_names = TRUE)
taxa <- d$Species
pruned_tree <- drop.tip(tree, tip = setdiff(tree$tip.label, taxa))
# plot the pruned tree
p <- ggtree(pruned_tree, layout = "fan") +
geom_tiplab(aes(label=label), size = 2)
pplot.phylo(pruned_tree, type = "fan", cex = 0.6)Effect of Phylogenetic Structure
When we are interested in coevolution of traits or adaptive relationships of traits within a group of organisms, it is critical to recognize that closely related species may share trait values because of shared evolutionary history, NOT because of adaptation.
To explore this, let’s simulate random evolution of two metric characters on this tree and then look at the relationship between them using regression… (the values for a and sig2 in the code below are arbitrary, just to give us a wider range of trait values)
set.seed(25)
bodysize <- fastBM(pruned_tree, a = 100, mu = 0, sig2 = 100)
homerange <- fastBM(pruned_tree, a = 100, mu = 0, sig2 = 100)
d <- tibble(bodysize = bodysize, homerange = homerange)
ggplot(d, aes(x = bodysize, y= homerange)) +
geom_point() +
geom_smooth(method = "lm")# there is a negative association between these two characters... but they have evolved INDEPENDENTLY!
m <- lm(homerange ~ bodysize, d)
summary(m)
Call:
lm(formula = homerange ~ bodysize, data = d)
Residuals:
Min 1Q Median 3Q Max
-11.9093 -3.7477 -0.0137 2.9922 21.0532
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 146.2817 11.5066 12.71 < 2e-16 ***
bodysize -0.4602 0.1189 -3.87 0.000226 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6.043 on 77 degrees of freedom
Multiple R-squared: 0.1628, Adjusted R-squared: 0.152
F-statistic: 14.98 on 1 and 77 DF, p-value: 0.0002263
set.seed(30)
bodysize <- fastBM(pruned_tree, a = 100, mu = 0, sig2 = 100)
homerange <- fastBM(pruned_tree, a = 100, mu = 0, sig2 = 100)
d <- tibble(bodysize = bodysize, homerange = homerange)
ggplot(d, aes(x = bodysize, y= homerange)) +
geom_point() +
geom_smooth(method = "lm")# there is a positive association between these two characters... but they have evolved INDEPENDENTLY!
m <- lm(homerange ~ bodysize, d)
summary(m)
Call:
lm(formula = homerange ~ bodysize, data = d)
Residuals:
Min 1Q Median 3Q Max
-20.7491 -4.4604 -0.2015 5.3040 11.1623
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 63.526 11.278 5.633 2.77e-07 ***
bodysize 0.371 0.110 3.373 0.00117 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6.776 on 77 degrees of freedom
Multiple R-squared: 0.1287, Adjusted R-squared: 0.1174
F-statistic: 11.38 on 1 and 77 DF, p-value: 0.001166
How about if we do this MANY times? A simulation…
sim <- tibble(est = numeric(), p = numeric(), seed = numeric())
for (i in 1:100){
set.seed(i)
bodysize <- fastBM(pruned_tree, a = 100, mu = 0, sig2 = 100)
homerange <- fastBM(pruned_tree, a = 100, mu = 0, sig2 = 100)
d <- tibble(bodysize = bodysize, homerange = homerange)
m <- lm(homerange ~ bodysize, d)
est <- m |> tidy() |>
filter(term == "bodysize") |>
select(estimate) |>
pull()
p <- m |> tidy() |>
filter(term == "bodysize") |>
select(p.value) |>
pull()
res <- tibble(est = est, p = p, seed = i)
sim <- bind_rows(sim, res)
}
par(mar = c(2, 2, 2, 2))
hist(sim$est, breaks = seq(-1.2, 1.2, 0.2), main = "Histogram of coefficient estimates")hist(sim$p, breaks = seq(0,1, by = 0.05), main = "Histogram of p values")Takehome? Many simulations wind up with p values of less than 0.05 for traits that are evolving independently of one another, based just on the phylogenetic tree structure!
Let’s pull out the seed with the highest positive estimate from our simulation…
seed <- sim |>
arrange(desc(est)) |>
slice_head(n = 1) |>
pull(seed)
set.seed(seed)
bodysize <- fastBM(pruned_tree, a = 100, mu = 0, sig2 = 100)
homerange <- fastBM(pruned_tree, a = 100, mu = 0, sig2 = 100)
d <- tibble(bodysize = bodysize, homerange = homerange)
ggplot(d, aes(x = bodysize, y= homerange)) +
geom_point() +
geom_smooth(method = "lm")m <- lm(homerange ~ bodysize, d)
summary(m)
Call:
lm(formula = homerange ~ bodysize, data = d)
Residuals:
Min 1Q Median 3Q Max
-9.632 -3.872 -1.164 2.888 21.622
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.75200 9.43651 1.033 0.305
bodysize 0.93058 0.09641 9.652 6.68e-15 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.241 on 77 degrees of freedom
Multiple R-squared: 0.5475, Adjusted R-squared: 0.5416
F-statistic: 93.17 on 1 and 77 DF, p-value: 6.683e-15
Note again that this steep positive correlation is due entirely to effects of phylogeny! Here, species data are not independent of one another…
Running a Phylogenetic Independent Contrasts Analysis
If we are interested in accounting for the effects of phylogeny on the interdependence of our data points, how might we control for this?
One way is to use CONTRASTS at each internal node, which are independent of one another. For a bifurcating tree with N taxa, the are N-1 independent contrasts.
Instead of use ordinary least square (OLS) regression where data points are each taxon’s trait values, we can run OLS where each observation is the CONTRAST at each internal node (and where we force the regression through the origin).
contrasts.bodysize <- pic(bodysize, pruned_tree) # use the pic function to calculate independent contrasts
contrasts.homerange <- pic(homerange, pruned_tree)
d <- tibble(contrasts.bodysize = contrasts.bodysize, contrasts.homerange = contrasts.homerange)
ggplot(d, aes(x = contrasts.bodysize, y = contrasts.homerange)) +
geom_point() +
geom_smooth(method = "lm")pic.m <- lm(contrasts.homerange ~ contrasts.bodysize + 0) # include the term + 0 to force the regression through the origin, i.e., to not calculate an intercept term
summary(pic.m)
Call:
lm(formula = contrasts.homerange ~ contrasts.bodysize + 0)
Residuals:
Min 1Q Median 3Q Max
-26.767 -5.761 1.338 7.796 24.958
Coefficients:
Estimate Std. Error t value Pr(>|t|)
contrasts.bodysize -0.02765 0.11404 -0.242 0.809
Residual standard error: 10.19 on 77 degrees of freedom
Multiple R-squared: 0.000763, Adjusted R-squared: -0.01221
F-statistic: 0.05879 on 1 and 77 DF, p-value: 0.8091
# there is no relationship between the contrasts...Takehome? It is critical to take phylogeny into account otherwise we risk concluding that adaptation or coevolution have occurred when the relationship seen between two traits of interest is due to phylogeny.
Phylogenetic Generalized Least Squares
The PIC method is fine for looking at two metric variables, but often we are interested in more than two traits and/or at traits that are non-metric. PGLS is more flexible approach. PIC is also just a specific application of PGLS.
First, we will do PIC on a new dataset and address the question, how is eye size related to body size in primates? This data set uses “Orbit_area” as a measure of eye size and “Skull_length” as a measure of body size.
d <- read_csv("primateEyes.csv", col_names = TRUE)
head(d)# A tibble: 6 × 7
Genus_species Group Skull_length Optic_foramen_area Orbit_area
<chr> <chr> <dbl> <dbl> <dbl>
1 Allenopithecus_nigroviridis Anthro… 98.5 7 299.
2 Alouatta_palliata Anthro… 110. 5.3 382.
3 Alouatta_seniculus Anthro… 108 8 359.
4 Aotus_trivirgatus Anthro… 60.5 3.1 297.
5 Arctocebus_aureus Prosim… 49.5 1.2 135.
6 Arctocebus_calabarensis Prosim… 53.8 1.6 157.
# ℹ 2 more variables: Activity_pattern <chr>, Activity_pattern_code <dbl>
tree <- read.tree("primateEyes.phy")
p <- ggtree(tree, layout = "fan") +
geom_tiplab(aes(label=label), size=2)
pggplot(d, aes(x = log(Skull_length), y=log(Orbit_area))) +
geom_point() +
geom_smooth(method = "lm")Ordinary least squares regression…
m <- lm(log(Orbit_area) ~ log(Skull_length), data = d)
summary(m)
Call:
lm(formula = log(Orbit_area) ~ log(Skull_length), data = d)
Residuals:
Min 1Q Median 3Q Max
-0.66781 -0.11317 -0.02423 0.14383 0.89835
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.10503 0.27496 0.382 0.703
log(Skull_length) 1.25911 0.06338 19.865 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2646 on 88 degrees of freedom
Multiple R-squared: 0.8177, Adjusted R-squared: 0.8156
F-statistic: 394.6 on 1 and 88 DF, p-value: < 2.2e-16
Now PIC…
orbit.area <- d$Orbit_area
names(orbit.area) <- d$Genus_species
skull.length <- d$Skull_length
names(skull.length) <- d$Genus_species
pic.orbit.area <- pic(log(orbit.area), tree)
pic.skull.length <- pic(log(skull.length), tree)
d <- tibble(contrasts.orbit.area = pic.orbit.area, contrasts.skull.length = pic.skull.length)
ggplot(d, aes(x = contrasts.skull.length, y= contrasts.orbit.area)) +
geom_point() +
geom_smooth(method = "lm")pic.m <- lm(pic.orbit.area ~ pic.skull.length + 0) # +0 to force the regression through the origin
summary(pic.m)
Call:
lm(formula = pic.orbit.area ~ pic.skull.length + 0)
Residuals:
Min 1Q Median 3Q Max
-0.103535 -0.023104 -0.004624 0.021021 0.175298
Coefficients:
Estimate Std. Error t value Pr(>|t|)
pic.skull.length 1.37867 0.07734 17.83 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.03738 on 88 degrees of freedom
Multiple R-squared: 0.7831, Adjusted R-squared: 0.7807
F-statistic: 317.8 on 1 and 88 DF, p-value: < 2.2e-16
For this dataset, a positive relationship between orbit size and bidy size remains when we use contrasts.
To now do PGLS, now, we need to convert our TREE to a correlation structure object.
There are different ways to do this, but one simple approach is to assume that the correlation between the residual errors of any pair of species is proportional to the distance on the tree back to the common ancestor of that pair. [There are alternative models to derive the correlation structure that imagine more complex forms of evolutionary change.]
# read in data again
d <- read_csv("primateEyes.csv", col_names = TRUE)
taxa <- d$Genus_species
corBM <- corBrownian(phy = tree, form = ~taxa)
corBMUninitialized correlation structure of class corBrownian
pgls.m <- gls(log(Orbit_area)~log(Skull_length), data = d, correlation = corBM) # note that we do not need to force the regression through the origin
summary(pgls.m)Generalized least squares fit by REML
Model: log(Orbit_area) ~ log(Skull_length)
Data: d
AIC BIC logLik
-79.58758 -72.15557 42.79379
Correlation Structure: corBrownian
Formula: ~taxa
Parameter estimate(s):
numeric(0)
Coefficients:
Value Std.Error t-value p-value
(Intercept) -0.2018553 0.3499265 -0.576851 0.5655
log(Skull_length) 1.3786743 0.0773418 17.825741 0.0000
Correlation:
(Intr)
log(Skull_length) -0.923
Standardized residuals:
Min Q1 Med Q3 Max
-2.5903639 -1.0543078 -0.7395119 -0.2535879 2.4063079
Residual standard error: 0.3193421
Degrees of freedom: 90 total; 88 residual
Compare the results for PGLS and PIC for this same dataset and analysis. Note that they yield the same results for the estimate and the p value. PIC is a specialized case of PGLS.
More generally, with PGLS, we can include additional covariates in our modeling.
For example, using this dataset, we can now see if eye size is related to both body size and activity pattern using, essentially, ANCOVA.
pgls.m <- gls(log(Orbit_area)~log(Skull_length) + Activity_pattern, data = d, correlation = corBM)
anova(pgls.m)Denom. DF: 86
numDF F-value p-value
(Intercept) 1 2017.1877 <.0001
log(Skull_length) 1 376.6683 <.0001
Activity_pattern 2 9.1575 2e-04
Based on the ANOVA table, when body size is taken into account, mean orbit size is different for different levels of activity pattern, and when activity pattern is taken into account, orbit size is positively related to body size.
We can visualize these patterns using the following code…
# create a grid of data to put into the pgls model to predict orbit size from body size and activity pattern...
newdata <- expand.grid(
Skull_length = seq(min(d$Skull_length), max(d$Skull_length), length.out = 100),
Activity_pattern = levels(as.factor(d$Activity_pattern))
)
newdata$prediction <- predict(pgls.m, newdata)And then plot the original data and regression lines for each level of activity pattern…
ggplot(d, aes(x = log(Skull_length), y = log(Orbit_area), color = Activity_pattern)) +
geom_point(size = 3) +
geom_line(data = newdata, aes(y = prediction), linewidth = 1) +
theme_minimal(base_size = 14) +
labs(
x = "log(Skull length)",
y = "log(Orbit area)",
color = "Activity pattern",
title = "PGLS results: Orbit area vs Skull length"
)Ancestral State Reconstruction
Let’s return to the Lewis et al data we looked at before and try to replicate their Ancestral State Reconstruction results.
tree_file <- "Springer et al S2 timetree with autocorrelated rates and hard-bounded constraints.newick"
tree <- read.tree(tree_file)
d <- read_csv("Lewis et al table S1.csv", col_names = TRUE)
taxa <- d$Species
d$Dominance <- as.factor(d$Dominance)
par(mar = c(2, 1, 1, 1))
# prune the tree to only species used by Lewis et al 2023
pruned_tree <- drop.tip(tree, tip = setdiff(tree$tip.label, taxa))
plot.phylo(pruned_tree, cex = 0.6)colors <- c("green", "blue", "red")
state_names <- c("codominant", "female", "male")
# plot the pruned tree with character states on tips
p <- ggtree(pruned_tree, layout = "fan") %<+% d +
geom_tippoint(aes(color = Dominance), size = 3) +
scale_color_manual(values = colors) +
geom_tiplab(aes(label=label), offset = 0.02, size=2)
pMk Model of Discrete Character Evolution
Dominant model for the evolution of discrete characters on a phylogeny is the Mk model (continuous time, discrete k, Markov chain model).
# define data to analyze
character_data <- d$Dominance
names(character_data) <- d$Species
tree <- pruned_tree
tree$node.label <- 1:Nnode(tree) + length(tree$tip.label)
# Run Mk Ancestral State Reconstruction ----
## equal rates model
fitER <- fitMk(tree, character_data, model = "ER")
## all rates different model
fitARD <- fitMk(tree, character_data, model = "ARD")
## symmetric rates model
fitSYM <- fitMk(tree, character_data, model = "SYM")
aov <- anova(fitER, fitARD, fitSYM) # compare models -- which best fits observed patterns of states at tips log(L) d.f. AIC weight
fitER -51.05972 1 104.1194 0.64700039
fitARD -49.40295 6 110.8059 0.02285398
fitSYM -49.73254 3 105.4651 0.33014563
bestMk <- rownames(aov[which.min(aov$AIC),])
Mk <- ancr(aov, type = "marginal", weighted = FALSE, tips = TRUE)
# with weighted = FALSE, ancr() uses the best supported model for ASR; with weighted = TRUE, models are weighted based on weight
Mk_probs <- Mk$ace # all nodes
Mk_node_pies <- Mk_probs[(length(tree$tip.label) + 1):(length(tree$tip.label) + tree$Nnode), ] # subset for just internal nodes
Mk_tip_pies <- Mk_probs[1:length(tree$tip.label), ] # subset for just tips
plot.phylo(tree, type = "cladogram", cex = 0.6, main = "Mk Model ASR")
#### add pie charts for ancestral states at internal nodes
nodelabels(
pie = Mk_node_pies,
piecol = colors,
cex = 0.4
)
legend("topleft",
legend = state_names,
fill = colors,
title = "Character States")#### add pie charts for tips
#tiplabels(
# pie = Mk_tip_pies,
# piecol = colors,
# cex = 0.4
#)Stochastic Character Mapping for ASR
#### Run SCM Ancestral State Reconstruction ----
nsim <- 100 # Number of stochastic maps to generate
scmER <- make.simmap(tree, character_data, model = "ER", nsim = nsim)make.simmap is sampling character histories conditioned on
the transition matrix
Q =
co-dominance female male
co-dominance -2.016977 1.008488 1.008488
female 1.008488 -2.016977 1.008488
male 1.008488 1.008488 -2.016977
(estimated using likelihood);
and (mean) root node prior probabilities
pi =
co-dominance female male
0.3333333 0.3333333 0.3333333
# or
# scmARD <- make.simmap(tree, character_data, model = "ARD", nsim = nsim)
# or
# scmSYM <- make.simmap(tree, character_data, model = "SYM", nsim = nsim)
#### summarize the stochastic maps
summary_scm <- summary(scmER)
#### extract posterior probabilities and pies for internal nodes and tips
scm_probs <- summary_scm$ace
scm_node_pies <- scm_probs[1:tree$Nnode, ]
scm_tip_pies <- summary_scm$tips
#### Plotting Results ----
plot.phylo(tree, type = "cladogram", cex = 0.6, main = "Stochastic Character Mapping ASR")
#### add pie charts for ancestral states at internal nodes
nodelabels(
pie = scm_node_pies,
piecol = colors,
cex = 0.4
)
legend("topleft",
legend = state_names,
fill = colors,
title = "Character States")#### add pie charts for tips
#tiplabels(
# pie = scm_tip_pies,
# piecol = colors,
# cex = 0.4
#)